#include "common_cuda.h"

// Update the positions: pos[i] += dt * vel[i] for every i in [0, n).
//
// Parameters:
//   pos - positions, updated in place; must hold at least n elements
//   vel - velocities, only read here; must hold at least n elements
//   n   - number of elements to update (n <= 0 does nothing)
//   dt  - time step that scales each velocity before it is added
//
// Launch layout: only the x dimension of the grid and block is used for
// indexing. Any 1D grid size is valid: each thread starts at its global
// index and advances by the total number of launched threads, so a grid
// smaller than n still covers every element and a grid larger than n
// leaves the surplus threads idle.
//
// NOTE(review): the float4 `+=` and `float * float4` operators are not CUDA
// built-ins; presumably they come from common_cuda.h - confirm there whether
// the w component is updated as well.
__global__ void updatePos(float4* pos, float4* vel, int n, float dt)
{
    // Index math is done in 64 bits: blockIdx.x*blockDim.x is a 32-bit
    // unsigned product that can wrap, or turn negative once narrowed to int,
    // on very large launches and would then slip past the bounds check.
    const long long count  = n;
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (long long i = first; i < count; i += stride)
        pos[i] += dt*vel[i]; // counted as 3 flops by the original author
}
